{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from os import listdir\n",
    "from numpy import array\n",
    "from keras.preprocessing.text import Tokenizer, one_hot\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import Model, Sequential, model_from_json\n",
    "from keras.utils import to_categorical\n",
    "from keras.layers.core import Dense, Dropout, Flatten\n",
    "from keras.optimizers import RMSprop\n",
    "from keras.layers.convolutional import Conv2D\n",
    "from keras.callbacks import ModelCheckpoint\n",
    "from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM, concatenate , Input, Reshape, Dense\n",
    "from keras.preprocessing.image import array_to_img, img_to_array, load_img\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dir_name = '/data/train/'\n",
    "\n",
    "# Read a file and return a string\n",
    "def load_doc(filename):\n",
    "    file = open(filename, 'r')\n",
    "    text = file.read()\n",
    "    file.close()\n",
    "    return text\n",
    "\n",
    "def load_data(data_dir):\n",
    "    text = []\n",
    "    images = []\n",
    "    # Load all the files and order them\n",
    "    all_filenames = listdir(data_dir)\n",
    "    all_filenames.sort()\n",
    "    for filename in (all_filenames):\n",
    "        if filename[-3:] == \"npz\":\n",
    "            # Load the images already prepared in arrays\n",
    "            image = np.load(data_dir+filename)\n",
    "            images.append(image['features'])\n",
    "        else:\n",
    "            # Load the boostrap tokens and rap them in a start and end tag\n",
    "            syntax = '<START> ' + load_doc(data_dir+filename) + ' <END>'\n",
    "            # Seperate all the words with a single space\n",
    "            syntax = ' '.join(syntax.split())\n",
    "            # Add a space after each comma\n",
    "            syntax = syntax.replace(',', ' ,')\n",
    "            text.append(syntax)\n",
    "    images = np.array(images, dtype=float)\n",
    "    return images, text\n",
    "\n",
    "train_features, texts = load_data(dir_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the function to create the vocabulary \n",
    "tokenizer = Tokenizer(filters='', split=\" \", lower=False)\n",
    "# Create the vocabulary \n",
    "tokenizer.fit_on_texts([load_doc('bootstrap.vocab')])\n",
    "\n",
    "# Add one spot for the empty word in the vocabulary \n",
    "vocab_size = len(tokenizer.word_index) + 1\n",
    "# Map the input sentences into the vocabulary indexes\n",
    "train_sequences = tokenizer.texts_to_sequences(texts)\n",
    "# The longest set of boostrap tokens\n",
    "max_sequence = max(len(s) for s in train_sequences)\n",
    "# Specify how many tokens to have in each input sentence\n",
    "max_length = 48\n",
    "\n",
    "def preprocess_data(sequences, features):\n",
    "    X, y, image_data = list(), list(), list()\n",
    "    for img_no, seq in enumerate(sequences):\n",
    "        for i in range(1, len(seq)):\n",
    "            # Add the sentence until the current count(i) and add the current count to the output\n",
    "            in_seq, out_seq = seq[:i], seq[i]\n",
    "            # Pad all the input token sentences to max_sequence\n",
    "            in_seq = pad_sequences([in_seq], maxlen=max_sequence)[0]\n",
    "            # Turn the output into one-hot encoding\n",
    "            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]\n",
    "            # Add the corresponding image to the boostrap token file\n",
    "            image_data.append(features[img_no])\n",
    "            # Cap the input sentence to 48 tokens and add it\n",
    "            X.append(in_seq[-48:])\n",
    "            y.append(out_seq)\n",
    "    return np.array(X), np.array(y), np.array(image_data)\n",
    "\n",
    "X, y, image_data = preprocess_data(train_sequences, train_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Create the encoder\n",
    "image_model = Sequential()\n",
    "image_model.add(Conv2D(16, (3, 3), padding='valid', activation='relu', input_shape=(256, 256, 3,)))\n",
    "image_model.add(Conv2D(16, (3,3), activation='relu', padding='same', strides=2))\n",
    "image_model.add(Conv2D(32, (3,3), activation='relu', padding='same'))\n",
    "image_model.add(Conv2D(32, (3,3), activation='relu', padding='same', strides=2))\n",
    "image_model.add(Conv2D(64, (3,3), activation='relu', padding='same'))\n",
    "image_model.add(Conv2D(64, (3,3), activation='relu', padding='same', strides=2))\n",
    "image_model.add(Conv2D(128, (3,3), activation='relu', padding='same'))\n",
    "\n",
    "image_model.add(Flatten())\n",
    "image_model.add(Dense(1024, activation='relu'))\n",
    "image_model.add(Dropout(0.3))\n",
    "image_model.add(Dense(1024, activation='relu'))\n",
    "image_model.add(Dropout(0.3))\n",
    "\n",
    "image_model.add(RepeatVector(max_length))\n",
    "\n",
    "visual_input = Input(shape=(256, 256, 3,))\n",
    "encoded_image = image_model(visual_input)\n",
    "\n",
    "language_input = Input(shape=(max_length,))\n",
    "language_model = Embedding(vocab_size, 50, input_length=max_length, mask_zero=True)(language_input)\n",
    "language_model = LSTM(128, return_sequences=True)(language_model)\n",
    "language_model = LSTM(128, return_sequences=True)(language_model)\n",
    "\n",
    "#Create the decoder\n",
    "decoder = concatenate([encoded_image, language_model])\n",
    "decoder = LSTM(512, return_sequences=True)(decoder)\n",
    "decoder = LSTM(512, return_sequences=False)(decoder)\n",
    "decoder = Dense(vocab_size, activation='softmax')(decoder)\n",
    "\n",
    "# Compile the model\n",
    "model = Model(inputs=[visual_input, language_input], outputs=decoder)\n",
    "optimizer = RMSprop(lr=0.0001, clipvalue=1.0)\n",
    "model.compile(loss='categorical_crossentropy', optimizer=optimizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Save the model for every 2nd epoch\n",
    "filepath=\"org-weights-epoch-{epoch:04d}--val_loss-{val_loss:.4f}--loss-{loss:.4f}.hdf5\"\n",
    "checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_weights_only=True, period=2)\n",
    "callbacks_list = [checkpoint]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the model\n",
    "model.fit([image_data, X], y, batch_size=64, shuffle=False, validation_split=0.1, callbacks=callbacks_list, verbose=1, epochs=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
